## 红楼梦文本挖掘之数据预处理####
## 主要用于文本文档的读取和构建
## 分析与挖掘R中的人物关系
## 红楼梦中关键人物的社交网络
## 孙玉林;2016年10月31


## 如果在每个段落中人物同时出现,则频数权重加1

## 加载所需要的包
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggplot2)
library(gridExtra)
library(networkD3)

## 读取数据
## 一共有149人出现的频次多余10次
Red_net <- read.csv("./数据/社交网络权重.csv")
Red_net[,1:2] <- apply(Red_net[,1:2],2,as.character)
Name_freq <- read.csv("./数据/红楼梦人物出现频次.csv")
Name_freq <- Name_freq[Name_freq$word %in%(union(unique(Red_net$First),
                                                 unique(Red_net$Second))),]
Name_freq$word <- as.character(Name_freq$word)
# union(unique(Red_net$First),unique(Red_net$Second))

## 可视化人出现的频次
p1 <- ggplot(Name_freq,aes(x = reorder(word,freq),y = freq)) +
  theme_bw(base_size = 9,base_family = "STKaiti")+
  geom_bar(stat = "identity",position = "dodge",fill = "lightblue") +
  theme(axis.text.x = element_text(size = 5,hjust = 1,angle = 90,vjust = 0.5),
        axis.title.x = element_blank()) +
  labs(x = "",y = "频数",title = "《红楼梦》中关键人物出现次数")

p2 <- ggplot(Name_freq[Name_freq$freq>80,],aes(x = reorder(word,freq),y = freq)) +
  theme_bw(base_size = 9,base_family = "STKaiti")+
  geom_bar(stat = "identity",position = "dodge",fill = "lightblue") +
  theme(axis.text.x = element_text(size = 9,hjust = 1,angle = 90,vjust = 0.5)) +
  labs(x = "人名",y = "频数")

grid.arrange(p1,p2,nrow = 2)

## 按照权重1,即章节权重分析人物的社交网络####
# 1:准备社交网络数据
chap_net <- Red_net[Red_net$chapweight > 10,c(1,2,3)]
names(chap_net) <- c("from","to","weight")

chap_vert <- Name_freq[Name_freq$word %in% as.character(union(unique(chap_net$from),
                   unique(chap_net$to))),]

chap_net <- graph_from_data_frame(chap_net,directed = FALSE,
                                  vertices = chap_vert)
chap_net
## IGRAPH UNW- 69 762 -- 
## + attr: name (v/c), freq (v/n), weight (e/n)
## + edges (vertex names):
##  [1] 宝玉--贾母       宝玉--凤姐       宝玉--袭人       宝玉--王夫人    
##  [5] 宝玉--宝钗       宝玉--贾政       宝玉--贾琏       宝玉--平儿      
##  [9] 宝玉--薛姨妈     宝玉--探春       宝玉--紫鹃       宝玉--鸳鸯      
## [13] 宝玉--贾珍       宝玉--李纨       宝玉--尤氏       宝玉--晴雯      
## [17] 宝玉--邢夫人     宝玉--薛蟠       宝玉--林黛玉     宝玉--香菱      
## [21] 宝玉--麝月       宝玉--贾蓉       宝玉--贾赦       宝玉--惜春      
## [25] 宝玉--贾芸       宝玉--周瑞家的   宝玉--芳官       宝玉--贾环      
## [29] 宝玉--妙玉       宝玉--雪雁       宝玉--迎春       宝玉--赵姨娘    
## + ... omitted several edges
chap_net$name <- "《红楼梦》章节人物关系"
V(chap_net)$media
## NULL
## 节点数目
vcount(chap_net)
## [1] 69
## 边的数目
ecount(chap_net)
## [1] 762
## 简化网络图
chap_net <- simplify(chap_net,remove.multiple = TRUE,remove.loops = TRUE,
                     edge.attr.comb = "mean")

## 查看节点的度
degrees <- data.frame(name = names(degree(chap_net)),
                      counts = (degree(chap_net)))

ggplot(degrees,aes(x = reorder(name,counts),y = counts)) +
  theme_bw(base_size = 11,base_family = "STKaiti")+
  geom_bar(stat = "identity",position = "dodge",fill = "lightblue") +
  theme(axis.text.x = element_text(size = 5,hjust = 1,angle = 90,vjust = 0.5),
        axis.title.x = element_blank()) +
  labs(x = "人名",y = "节点的度",title = "《红楼梦》")

## 判断事否为联通图
is.connected(chap_net)
## [1] TRUE
## 计算图的直径
diameter(chap_net,directed = FALSE)
## [1] 48
## 
set.seed(1234)
par(cex = 0.8,family = "STKaiti")
## 设置图层
layout1 <- layout.lgl(chap_net)
layout2 <- layout.kamada.kawai(chap_net)
layout3 <- layout.reingold.tilford(chap_net)
layout4 <- layout.fruchterman.reingold(chap_net)

#V(chap_net)$size <- Name_freq$freq/10
## 设置节点的字体
V(chap_net)$label.family <- "STKaiti"
E(chap_net)$width <- log10(E(chap_net)$weight) *2
egam <- (E(chap_net)$width) / max(E(chap_net)$width)
E(chap_net)$color <- rgb(1,0.5,0.5,egam)
V(chap_net)$size <- log(V(chap_net)$freq) * 2.5
plot(chap_net,layout = layout1,main = "《红楼梦》根据章节部分人物关系")

plot(chap_net,layout = layout2,main = "《红楼梦》根据章节部分人物关系")

plot(chap_net,layout = layout3,main = "《红楼梦》根据章节部分人物关系")

plot(chap_net,layout = layout4,main = "《红楼梦》根据章节部分人物关系")

## 按照权重2,即段落权重分析人物的社交网络####
# 1:准备社交网络数据
chap_net <- Red_net[Red_net$duanweight > 10,c(1,2,4)]
names(chap_net) <- c("from","to","weight")

chap_vert <- Name_freq[Name_freq$word %in% as.character(union(unique(chap_net$from),
                                                              unique(chap_net$to))),]

chap_net <- graph_from_data_frame(chap_net,directed = FALSE,
                                  vertices = chap_vert)
chap_net
## IGRAPH UNW- 77 398 -- 
## + attr: name (v/c), freq (v/n), weight (e/n)
## + edges (vertex names):
##  [1] 宝玉--贾母   宝玉--凤姐   宝玉--袭人   宝玉--王夫人 宝玉--宝钗  
##  [6] 宝玉--贾政   宝玉--贾琏   宝玉--平儿   宝玉--薛姨妈 宝玉--探春  
## [11] 宝玉--紫鹃   宝玉--鸳鸯   宝玉--贾珍   宝玉--李纨   宝玉--尤氏  
## [16] 宝玉--晴雯   宝玉--刘姥姥 宝玉--邢夫人 宝玉--薛蟠   宝玉--林黛玉
## [21] 宝玉--香菱   宝玉--麝月   宝玉--贾蓉   宝玉--贾赦   宝玉--惜春  
## [26] 宝玉--贾芸   宝玉--芳官   宝玉--贾环   宝玉--妙玉   宝玉--雪雁  
## [31] 宝玉--迎春   宝玉--赵姨娘 宝玉--莺儿   宝玉--秦钟   宝玉--巧姐  
## [36] 宝玉--秋纹   宝玉--贾兰   宝玉--茗烟   宝玉--史湘云 宝玉--大了  
## + ... omitted several edges
chap_net$name <- "《红楼梦》章节人物关系"
V(chap_net)$media
## NULL
## 节点数目
vcount(chap_net)
## [1] 77
## 边的数目
ecount(chap_net)
## [1] 398
## 简化网络图
chap_net <- simplify(chap_net,remove.multiple = TRUE,remove.loops = TRUE,
                     edge.attr.comb = "mean")

## 查看节点的度
degrees <- data.frame(name = names(degree(chap_net)),
                      counts = (degree(chap_net)))

ggplot(degrees,aes(x = reorder(name,counts),y = counts)) +
  theme_bw(base_size = 11,base_family = "STKaiti")+
  geom_bar(stat = "identity",position = "dodge",fill = "lightblue") +
  theme(axis.text.x = element_text(size = 5,hjust = 1,angle = 90,vjust = 0.5),
        axis.title.x = element_blank()) +
  labs(x = "人名",y = "节点的度",title = "《红楼梦》")

## 判断事否为联通图
is.connected(chap_net)
## [1] TRUE
## 计算图的直径
diameter(chap_net,directed = FALSE)
## [1] 87
## 
set.seed(1234)
par(cex = 0.8,family = "STKaiti")
## 设置图层
layout1 <- layout.lgl(chap_net)
layout2 <- layout.kamada.kawai(chap_net)
layout3 <- layout.reingold.tilford(chap_net)
layout4 <- layout.fruchterman.reingold(chap_net)

#V(chap_net)$size <- Name_freq$freq/10
## 设置节点的字体
V(chap_net)$label.family <- "STKaiti"
E(chap_net)$width <- log10(E(chap_net)$weight) *2
egam <- (E(chap_net)$width) / max(E(chap_net)$width)
E(chap_net)$color <- rgb(1,0.5,0.5,egam)
V(chap_net)$size <- log(V(chap_net)$freq) * 2.5
plot(chap_net,layout = layout1,main = "《红楼梦》根据段落部分人物关系")

plot(chap_net,layout = layout2,main = "《红楼梦》根据段落部分人物关系")

plot(chap_net,layout = layout3,main = "《红楼梦》根据段落部分人物关系")

plot(chap_net,layout = layout4,main = "《红楼梦》根据段落部分人物关系")

## 按照权重2,即段落权重分析人物的社交网络####
## 分析链接次数较大的人物
# 1:准备社交网络数据
chap_net <- Red_net[Red_net$duanweight > 50,c(1,2,4)]
names(chap_net) <- c("from","to","weight")

chap_vert <- Name_freq[Name_freq$word %in% as.character(union(unique(chap_net$from),
                                                              unique(chap_net$to))),]

chap_net <- graph_from_data_frame(chap_net,directed = FALSE,
                                  vertices = chap_vert)
chap_net
## IGRAPH UNW- 24 70 -- 
## + attr: name (v/c), freq (v/n), weight (e/n)
## + edges (vertex names):
##  [1] 宝玉  --贾母   宝玉  --凤姐   宝玉  --袭人   宝玉  --王夫人
##  [5] 宝玉  --宝钗   宝玉  --贾政   宝玉  --贾琏   宝玉  --平儿  
##  [9] 宝玉  --薛姨妈 宝玉  --探春   宝玉  --紫鹃   宝玉  --鸳鸯  
## [13] 宝玉  --贾珍   宝玉  --李纨   宝玉  --尤氏   宝玉  --晴雯  
## [17] 宝玉  --邢夫人 宝玉  --林黛玉 宝玉  --麝月   宝玉  --惜春  
## [21] 宝玉  --迎春   贾母  --凤姐   贾母  --袭人   贾母  --王夫人
## [25] 贾母  --宝钗   贾母  --贾政   贾母  --贾琏   贾母  --薛姨妈
## [29] 贾母  --探春   贾母  --鸳鸯   贾母  --贾珍   贾母  --李纨  
## + ... omitted several edges
chap_net$name <- "《红楼梦》章节人物关系"
V(chap_net)$media
## NULL
## 节点数目
vcount(chap_net)
## [1] 24
## 边的数目
ecount(chap_net)
## [1] 70
## 简化网络图
chap_net <- simplify(chap_net,remove.multiple = TRUE,remove.loops = TRUE,
                     edge.attr.comb = "mean")

## 查看节点的度
degrees <- data.frame(name = names(degree(chap_net)),
                      counts = (degree(chap_net)))

ggplot(degrees,aes(x = reorder(name,counts),y = counts)) +
  theme_bw(base_size = 11,base_family = "STKaiti")+
  geom_bar(stat = "identity",position = "dodge",fill = "lightblue") +
  theme(axis.text.x = element_text(size = 5,hjust = 1,angle = 90,vjust = 0.5),
        axis.title.x = element_blank()) +
  labs(x = "人名",y = "节点的度",title = "《红楼梦》")

## 判断事否为联通图
is.connected(chap_net)
## [1] TRUE
## 计算图的直径
diameter(chap_net,directed = FALSE)
## [1] 289
## 
set.seed(1234)
par(cex = 0.8,family = "STKaiti")
## 设置图层
layout1 <- layout.lgl(chap_net)
layout2 <- layout.kamada.kawai(chap_net)
layout3 <- layout.reingold.tilford(chap_net)
layout4 <- layout.fruchterman.reingold(chap_net)

#V(chap_net)$size <- Name_freq$freq/10
## 设置节点的字体
V(chap_net)$label.family <- "STKaiti"
E(chap_net)$width <- log10(E(chap_net)$weight) *2
egam <- (E(chap_net)$width) / max(E(chap_net)$width)
E(chap_net)$color <- rgb(1,0.5,0.5,egam)
V(chap_net)$size <- log(V(chap_net)$freq) * 2.5
plot(chap_net,layout = layout1,main = "《红楼梦》根据段落部分人物关系")

plot(chap_net,layout = layout2,main = "《红楼梦》根据段落部分人物关系")

plot(chap_net,layout = layout3,main = "《红楼梦》根据段落部分人物关系")

plot(chap_net,layout = layout4,main = "《红楼梦》根据段落部分人物关系")

library(networkD3)
library(igraph)

# Basic Graph
chap_net <- Red_net[Red_net$duanweight > 40,c(1,2,4)]
g <- graph.data.frame(chap_net, directed=F) # raw graph


## Make a vertices df
vertices<-data.frame(
  name = V(g)$name,
  group = edge.betweenness.community(g)$membership,
  betweenness = (betweenness(g,directed=F,normalized=T)*115)+0.1 #so size isn't tiny
) 
#nb. can also adjust nodesize with `radiusCalculation`

# create indices (indexing needs to be JS format)
chap_net$source.index = match(chap_net$First, vertices$name)-1
chap_net$target.index = match(chap_net$Second, vertices$name)-1


# supply a edgelist + nodelist
d3 = forceNetwork(Links = chap_net, Nodes = vertices,
                  Source = 'source.index', Target = 'target.index',
                  NodeID = 'name',
                  Group = 'group', # color nodes by group calculated earlier
                  charge = -200, # node repulsion
                  linkDistance = 20,
                  zoom = T, 
                  opacity = 1,
                  fontSize=24)

show(d3)
# Basic Graph
chap_net <- Red_net[Red_net$duanweight > 60,c(1,2,4)]
g <- graph.data.frame(chap_net, directed=F) # raw graph


## Make a vertices df
vertices<-data.frame(
  name = V(g)$name,
  group = edge.betweenness.community(g)$membership,
  betweenness = (betweenness(g,directed=F,normalized=T)*115)+0.1 #so size isn't tiny
) 
#nb. can also adjust nodesize with `radiusCalculation`

# create indices (indexing needs to be JS format)
chap_net$source.index = match(chap_net$First, vertices$name)-1
chap_net$target.index = match(chap_net$Second, vertices$name)-1


# supply a edgelist + nodelist
d3 = forceNetwork(Links = chap_net, Nodes = vertices,
                  Source = 'source.index', Target = 'target.index',
                  NodeID = 'name',
                  Group = 'group', # color nodes by group calculated earlier
                  charge = -200, # node repulsion
                  linkDistance = 20,
                  zoom = T, 
                  opacity = 1,
                  fontSize=24)

show(d3)